library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────── tidyverse 1.2.1.9000 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.99.9000 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(corrplot)
## corrplot 0.84 loaded
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(tmap) # GET DEV VERSION! ("view" mode issues)
NOTE: You must have (very) recently installed tidyr from CRAN for pivot_longer() and pivot_wider() to work, and you must load the package in addition to the tidyverse. That’s because the newest version of tidyr is not included in the most recent tidyverse release, so those updates aren’t included in the overall tidyverse (but are in CRAN tidyr now).
Here, we’ll use the world_bank_pop dataset (World Bank population data) from tidyr. Notice that it is in wide format (not tidy) - each year has its own column, when really there should be a single column ‘year’. We’ll use pivot_longer() to make it so.
Note: indicator SP.POP.TOTL = total population (that’s the variable we’ll explore today)
View(world_bank_pop) (and ask yourself - why wouldn’t you want to include the View() function in a code chunk that is then knit?) Let’s do some wrangling:
# Reshape World Bank population data from wide (one column per year) to long
# (one row per country-year). Keep only the total-population indicator.
pop_long <- world_bank_pop %>%
  filter(indicator == "SP.POP.TOTL") %>%   # total population only
  select(-indicator) %>%                    # indicator is now constant; drop it
  pivot_longer(cols = `2000`:`2017`,        # backticks also work for numeric names
               names_to = "year",
               values_to = "population") %>%
  # pivot_longer() keeps the former column names as character, so coerce:
  mutate(year = as.numeric(year))
# Average annual growth rate (slope over the full period) for each country.
# FIX: the original used (max(population) - min(population)), which equals the
# endpoint-to-endpoint change only when population grows monotonically. For a
# country whose population ever declines, that overstates the rate. Pair the
# population values with the earliest and latest year instead.
pop_rates <- pop_long %>%
  group_by(country) %>%
  summarize(
    rate = (population[which.max(year)] - population[which.min(year)]) /
      (max(year) - min(year))
  ) %>%
  filter(country %in% c("USA","MEX","CAN"))
# COOL, pivot_longer is much more intuitive than gather() and I strongly recommend getting new tidyr
# Correlation matrix for the numeric midwest columns (poptotal through
# percbelowpoverty), then two corrplot renderings of it.
mw_num <- cor(select(midwest, poptotal:percbelowpoverty))
corrplot::corrplot(mw_num)
# A different look — upper triangle only, ellipse glyphs (see ?corrplot):
corrplot(mw_num, type = "upper", method = "ellipse")
Load the data ‘slo_homes.csv’ (after dropping the file into the project folder)
# Read the SLO home sales data (file lives in the project folder);
# read_csv() echoes its guessed column types below.
slo_homes <- read_csv("slo_homes.csv")
## Parsed with column specification:
## cols(
## City = col_character(),
## Price = col_double(),
## Bedrooms = col_double(),
## Bathrooms = col_double(),
## SqFt = col_double(),
## PricePerSqFt = col_double(),
## Status = col_character()
## )
Keep only data for ‘San Luis Obispo’, ‘Arroyo Grande’, ‘Atascadero’, and ‘Santa Maria-Orcutt’:
# Subset to the four cities of interest (%in% matches any of the listed names)
home_df <- filter(slo_homes,
                  City %in% c("San Luis Obispo", "Arroyo Grande",
                              "Atascadero", "Santa Maria-Orcutt"))
Prepare models (again - you will be responsible for conceptual understanding, checking assumptions, running diagnostics, etc.). This is just how it’s done in R.
# Run the complete model: Price as a function of all predictors.
# Character predictors (Status, City) are treated as factors; the reference
# level for each is the first alphabetically (see coefficient names below).
home_lm <- lm(Price ~ SqFt + Bedrooms + Bathrooms + Status + City, data = home_df)
# And, just as an example, what if we remove 'Bedrooms'
home_lm2 <- lm(Price ~ SqFt + Bathrooms + Status + City, data = home_df)
# Check out the results
summary(home_lm)
##
## Call:
## lm(formula = Price ~ SqFt + Bedrooms + Bathrooms + Status + City,
## data = home_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1058160 -66637 3405 67836 3857622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 189743.34 64528.62 2.940 0.00347 **
## SqFt 258.91 24.71 10.477 < 2e-16 ***
## Bedrooms -61749.96 18674.09 -3.307 0.00103 **
## Bathrooms 36058.56 22939.04 1.572 0.11677
## StatusRegular 208261.90 49074.78 4.244 2.74e-05 ***
## StatusShort Sale -3975.09 32643.17 -0.122 0.90314
## CityAtascadero -199864.94 48835.17 -4.093 5.18e-05 ***
## CitySan Luis Obispo 13904.28 70820.48 0.196 0.84445
## CitySanta Maria-Orcutt -260860.77 43072.43 -6.056 3.25e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 240300 on 394 degrees of freedom
## Multiple R-squared: 0.5468, Adjusted R-squared: 0.5376
## F-statistic: 59.41 on 8 and 394 DF, p-value: < 2.2e-16
# Compare coefficients with the full model after dropping Bedrooms:
summary(home_lm2)
##
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Status + City, data = home_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1054065 -66720 7094 63253 3937520
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99658.68 59226.90 1.683 0.0932 .
## SqFt 236.86 24.09 9.831 < 2e-16 ***
## Bathrooms 10694.72 21888.79 0.489 0.6254
## StatusRegular 209850.13 49685.69 4.224 2.99e-05 ***
## StatusShort Sale 1099.85 33014.57 0.033 0.9734
## CityAtascadero -204805.56 49422.32 -4.144 4.18e-05 ***
## CitySan Luis Obispo 29889.04 71538.29 0.418 0.6763
## CitySanta Maria-Orcutt -278132.37 43288.87 -6.425 3.80e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 243300 on 395 degrees of freedom
## Multiple R-squared: 0.5342, Adjusted R-squared: 0.5259
## F-statistic: 64.71 on 7 and 395 DF, p-value: < 2.2e-16
# What are the reference levels?
# Do these coefficients make sense?
# What do the diagnostics tell us (using plot)?
# There is one variable coefficient that is actually very unexpected here. Which is it?
Create output tables using stargazer:
# Side-by-side HTML regression table for the two models
# (named arguments, so order doesn't matter)
stargazer(home_lm, home_lm2,
          type = "html",
          digits = 2,
          title = "My awesome title!",
          dep.var.caption = "A little column caption...",
          dep.var.labels = "Model")
| A little column caption… | ||
| Model | ||
| (1) | (2) | |
| SqFt | 258.91*** | 236.86*** |
| (24.71) | (24.09) | |
| Bedrooms | -61,749.96*** | |
| (18,674.09) | ||
| Bathrooms | 36,058.56 | 10,694.72 |
| (22,939.04) | (21,888.79) | |
| StatusRegular | 208,261.90*** | 209,850.10*** |
| (49,074.78) | (49,685.69) | |
| StatusShort Sale | -3,975.09 | 1,099.85 |
| (32,643.17) | (33,014.57) | |
| CityAtascadero | -199,864.90*** | -204,805.60*** |
| (48,835.17) | (49,422.32) | |
| CitySan Luis Obispo | 13,904.28 | 29,889.04 |
| (70,820.48) | (71,538.29) | |
| CitySanta Maria-Orcutt | -260,860.80*** | -278,132.40*** |
| (43,072.43) | (43,288.87) | |
| Constant | 189,743.30*** | 99,658.68* |
| (64,528.62) | (59,226.90) | |
| Observations | 403 | 403 |
| R2 | 0.55 | 0.53 |
| Adjusted R2 | 0.54 | 0.53 |
| Residual Std. Error | 240,332.70 (df = 394) | 243,336.10 (df = 395) |
| F Statistic | 59.41*** (df = 8; 394) | 64.71*** (df = 7; 395) |
| Note: | p<0.1; p<0.05; p<0.01 | |
We’ll use read_sf to read in entire layers of spatial data (different files do different things: geometries, attributes, metadata, projections, etc.). We want to read them all in at once. Easy with sf!
First, let’s explore fire hazard zones in SB county (data fhszs06_3_42):
# Read the fire hazard severity zone layer and standardize the column names
firezone <- clean_names(read_sf(dsn = ".", layer = "fhszs06_3_42"))

# Exploration (best done in the console):
# View(firezone)
# st_crs(firezone) - no EPSG code (we'll set one)

# Keep only haz_class (sf carries the geometry column along automatically),
# then transform to WGS84 (EPSG:4326)
fire_haz <- st_transform(select(firezone, haz_class), crs = 4326)

# Quick base-R look:
plot(fire_haz)
But we probably want it to have some context, and possibly even some interactivity!
# tmap version: polygons colored by hazard class, semi-transparent
fire_map <- tm_shape(fire_haz) +
  tm_polygons("haz_class",
              alpha = 0.5,
              style = "quantile",
              title = "Santa Barbara Fire Hazard Severity")
fire_map
# Switch to interactive viewing mode and draw it again:
tmap_mode("view")
## tmap mode set to interactive viewing
fire_map
# Even knit it and open the HTML....oooooo aaaaaaa
Get the entire layer for california_county_shape_file:
# Read the CA counties layer. DON'T USE THIS resolution for real work — it's
# just to keep today quick (show failure example at dTolerance = 10000).
ca_counties <- read_sf(dsn = ".", layer = "california_county_shape_file")
# The shapefile ships without projection info, so assign WGS84 explicitly.
# FIX: use `<-` (not `=`) for replacement-function assignment, per R style.
st_crs(ca_counties) <- 4326
Then wrangle it a bit, and see a base R plot:
# Cleaned-up version: snake_case names, keep only name and area (geometry rides along)
ca_clean <- select(clean_names(ca_counties), name, area)
plot(ca_clean)
Now make a better plot with tmap…(will stay in “view” mode until you change it back…)
# tmap version, counties filled by area with the "natural" style:
ca_map <- tm_shape(ca_clean) +
  tm_fill("area") +
  tm_style("natural")
ca_map

# You can also pick your own ESRI basemap!
# http://leaflet-extras.github.io/leaflet-providers/preview/
# (or use ?leaflet in Console, and click on the link in 'server' to leaflet extras)
# For example, the Stamen watercolor tiles:
ca_watercolor <- tm_basemap("Stamen.Watercolor") +
  tm_shape(ca_clean) +
  tm_fill("area")
ca_watercolor
# ggplot2 version of the hazard map: fill by class, no polygon outlines
ggplot(data = fire_haz) +
  geom_sf(aes(fill = haz_class), color = NA) +
  scale_fill_manual(values = c("orange", "yellow", "red"))
Get the state fault line data:
# State fault line layer: clean names, keep only the line type, reproject to WGS84
faults <- clean_names(read_sf(dsn = ".", layer = "GMC_str_arc")) %>%
  select(ltype) %>%
  st_transform(crs = 4326)
plot(faults)
And we can overlay those on ca_counties:
# Dark basemap, translucent county polygons, fault lines overlaid in yellow.
# FIX: the original passed `fill = "gray10"`, but tmap v3's tm_polygons() has
# no `fill` argument (it uses `col`, as the SB map below does), so the fill
# color was silently ignored.
fault_map <- tm_basemap("CartoDB.DarkMatter") +
  tm_shape(ca_clean) +
  tm_polygons(col = "gray10",
              alpha = 0.2,
              border.col = "gray80") +
  tm_shape(faults) +
  tm_lines(col = "yellow")
fault_map
But what if we’re only interested in fault lines within SB county? Then we’ll want to clip our spatial data based on another polygon:
First, just get the spatial information for SB County (from ca_clean):
# Just Santa Barbara County.
# (Why multiple polygons for a single county? Islaaaaands)
sb_polygon <- filter(ca_clean, name == "Santa Barbara")
Then, clip the fault lines data, bounded by sb_polygon
# Clip the fault lines to the SB county polygon (both layers are EPSG:4326).
# The warnings below are expected: st_intersection treats lon/lat coordinates
# as planar, and attributes are assumed spatially constant per geometry.
fault_clip <- st_intersection(faults, sb_polygon)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant
## throughout all geometries
# Check it out:
# plot(fault_clip)
Then let’s just plot the fault lines in SB:
# SB county with its clipped fault lines on a NatGeo basemap
faults_sb <- tm_basemap("Esri.NatGeoWorldMap") +
  tm_shape(sb_polygon) +
  tm_polygons(col = "black", alpha = 0.2, border.col = "gray10") +
  tm_shape(fault_clip) +
  tm_lines(col = "white")
faults_sb